1 Dataset Collection¶

ChEMBL webresource client [5]

This is the only official Python client library developed and supported by the ChEMBL group. The library provides access to ChEMBL data and cheminformatics tools from Python. The client handles interaction with the HTTPS protocol and caches all results in the local file system for faster retrieval.

In [ ]:
# Installation
!pip install chembl_webresource_client

1.1 Available data entities¶

In [ ]:
# import the ChEMBL web resource client
from chembl_webresource_client.new_client import new_client

# list all available entities in this database
available_resources = [resource for resource in dir(new_client) if not resource.startswith('_')]

available_resources
Out[ ]:

1.2 Available filters¶

We can use "filter" to get the specific database as we want. The supported lookup types are:

  • exact
  • iexact ("i" means case insensitive)
  • contains
  • icontains
  • in
  • gt
  • gte
  • lt
  • lte (can be used, e.g., to filter molecules by molecular weight)
  • startswith
  • istartswith
  • endswith
  • iendswith
  • range
  • isnull
  • regex
  • iregex
  • search
In [ ]:
import pandas as pd

# access the "molecule" entity and filter by preferred name (pref_name)
molecule = new_client.molecule
mols = molecule.filter(pref_name__iexact='aspirin')

mols_pd = pd.DataFrame(mols)
mols_pd
Out[ ]:

1 rows × 36 columns
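
For a numeric lookup such as lte, the same pattern applies. The cell below is a minimal sketch (not part of the original notebook); the molecule_properties__mw_freebase field follows the client's documented examples and is an assumption here.

In [ ]:
# Hypothetical example: use the "lte" lookup to keep molecules with molecular weight <= 300
molecule = new_client.molecule
light_mols = molecule.filter(molecule_properties__mw_freebase__lte=300)

pd.DataFrame(light_mols[:5])  # preview the first few matching records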

1.3 Only operator¶

"only" is a special method allowing to limit the results to a selected set of fields.

In [ ]:
# We can get some specific columns
activity = new_client.activity
activities = activity.filter(target_chembl_id="CHEMBL235").filter(standard_type="IC50").only(['molecule_chembl_id','canonical_smiles','type','value'])

df_activity = pd.DataFrame(activities)
df_activity.head()
Out[ ]:
In [ ]:
# We can also get many molecules by id
molecule = new_client.molecule
mols = molecule.filter(molecule_chembl_id__in=['CHEMBL25', 'CHEMBL192', 'CHEMBL27']).only(['molecule_chembl_id', 'pref_name','molecule_structures'])

mols_df = pd.DataFrame(mols)
mols_df
Out[ ]:
In [ ]:
from IPython.display import SVG

# We can also display the image for any molecules we want
image = new_client.image
image.set_format('svg')
SVG(image.get('CHEMBL327767'))
Out[ ]:
[Output: SVG depiction of molecule CHEMBL327767]

2 Data Processing¶

Before we use the retrieved data to build our model, we need to preprocess it to make sure it is well prepared. Data preprocessing mainly includes cleaning, normalization, and transformation. [6]

2.1 Cleaning¶

We usually need to make sure our data are all numeric before calculation and model building.

Pandas' ".to_numeric()" is used in such situations.

2.1.1 ".to_numeric()"¶

It is one of the general functions in pandas, used to convert its argument to a numeric type.

In [ ]:
# Use the IC50 dataset as an example:
activity = new_client.activity
activities = activity.filter(target_chembl_id="CHEMBL235").filter(standard_type="IC50")
df_activities = pd.DataFrame(activities)

pd.set_option('display.max_columns', None) # use this to show all columns

df_activities.head()
Out[ ]:
In [ ]:
# Use "to_numeric" to convert all data in the "standard_value" column to numeric type
df_activities["standard_value"] = pd.to_numeric(df_activities["standard_value"], errors="coerce")  # errors="coerce" means convert it into NaN

df_activities["standard_value"]
Out[ ]:

2258 rows × 1 columns


2.1.2 ".dropna()"¶

We can use ".dropna()" to drop all rows with NaN values in a given column.

In [ ]:
# drop all rows which contain NaN values in "standard_value"
df_activities = df_activities.dropna(subset=["standard_value"])

df_activities["standard_value"]
Out[ ]:

2161 rows × 1 columns


2.2 Normalization¶

Machine learning algorithms don't perform well when the input numerical attributes have very different scales. So we need Normalization or Standardization to scale our features.

In [ ]:
# First, we can define a DataFrame which has very different scales
data_example = pd.DataFrame({
    "feature_1": [0.5, 0.6, 0.7, 0.8],
    "feature_2": [200, 200, 300, 400]
    })

2.2.1 Min-max scaling¶

Min-max scaling (many people call this normalization) is the simplest: for each attribute, the values are shifted and rescaled so that they end up ranging from 0 to 1 by default (or any other range we want, such as -1 to 1).

This is performed by subtracting the min value and dividing by the difference between the min and the max.
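
Written out for a single attribute x, scaling to a target range [a, b] (with [0, 1] as the default) is:

$$x_{\text{scaled}} = a + (b - a)\,\frac{x - x_{\min}}{x_{\max} - x_{\min}}$$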

In [ ]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Convert the dataframe into an array
data_example_arr = data_example.values
print("before normalization: \n", data_example_arr)

# Use the "Min-max scaling" method to do the feature scaling
min_max_scaler = MinMaxScaler(feature_range=(-1, 1)) # scale to the range (-1, 1)
data_example_scaled_minmax = min_max_scaler.fit_transform(data_example_arr) # input our data
print("after normalization: \n", data_example_scaled_minmax)

2.2.2 Standardization¶

First it subtracts the mean value (so standardized values have a zero mean), then it divides the result by the standard deviation (so standardized values have a standard deviation equal to 1). Unlike min-max scaling, standardization does not restrict values to a specific range.
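
In formula form, for an attribute x with mean μ and standard deviation σ, the standardized value is:

$$z = \frac{x - \mu}{\sigma}$$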

In [ ]:
from sklearn.preprocessing import StandardScaler

print("before Standardization: \n", data_examplea_arr)

# Use "Standardization" method to do the feature scaling
std_scaler = StandardScaler()
data_example_std_scaled = std_scaler.fit_transform(data_examplea_arr) #input our data

print("after Standardization: \n", data_example_std_scaled)

2.3 Transformation¶

Data transformation involves modifying the raw data and converting it into a format that is more suitable for analysis and model training.

The normalization and standardization used above are forms of feature scaling, one of the most important transformations to apply to our data. Besides feature scaling, another common transformation is the log transformation.
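
For the IC50 values used below, the log transformation is the standard pIC50 conversion. Assuming the standard_value is reported in nM (as is typical for ChEMBL IC50 records), the value is first converted to molar units and then negatively log-transformed, which is what the x * 1e-9 term in the pIC50 code below does:

$$\text{pIC50} = -\log_{10}\left(\text{IC50}\,[\text{nM}] \times 10^{-9}\right)$$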

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# plot the Histogram for IC50(standard_value column)
plt.figure(figsize=(8, 6))
sns.histplot(df_activities['standard_value'], bins=80, kde=True, color='skyblue')
plt.xlabel('Values')
plt.ylabel('Count')
plt.title('Histogram of IC50')
plt.show()
[Output: histogram of the IC50 standard_value column]
In [ ]:
df_activities['pIC50'] = df_activities['standard_value'].apply(lambda x: -np.log10(x * 1e-9))
df_activities = df_activities[np.isfinite(df_activities['pIC50'])] # drop infinite values

# plot the Histogram for IC50 after Log Transformation
plt.figure(figsize=(8, 6))
sns.histplot(df_activities['pIC50'], bins=10, kde=True, color='pink')
plt.xlabel('Values')
plt.ylabel('Count')
plt.title('Histogram of pIC50')
plt.show()
[Output: histogram of pIC50]

3 Dimensionality Reduction [6]¶

3.1 Introduction¶

What is dimensionality reduction?

Dimensionality reduction is a process used in data analysis and machine learning to reduce the number of features or dimensions in a dataset while retaining as much relevant information as possible.

Why do we need dimensionality reduction?

Many machine learning tasks involve dealing with thousands or even millions of features for each training example. This large number of features not only significantly slows down the training process, but also makes it more challenging to find a good solution, because a new instance will likely be far away from any training instance, making predictions much less reliable than in lower dimensions.

Dimensionality reduction not only speeds up training, but is also extremely useful for data visualization. Reducing the number of dimensions down to two (or three) makes it possible to plot a condensed view of a high-dimensional training set on a graph and often gain important insights by visually detecting patterns. Moreover, data visualization is essential to communicate your conclusions to people who are not data scientists, in particular decision makers who will use your results.

Notes:

Reducing dimensionality causes some information loss: even though it speeds up training, it may make your system perform slightly worse, and it also makes your pipelines a bit more complex and thus harder to maintain. So be sure to first try training your system with the original data before considering dimensionality reduction.

Main Approaches for Dimensionality Reduction:

  1. Projection Learning
  2. Manifold Learning

3.2 Projection learning¶

In most real-world problems, training instances are not spread out uniformly across all dimensions. As a result, all training instances lie within (or close to) a much lower-dimensional subspace of the high-dimensional space.

Here is a simple example of reducing the dataset's dimensionality from 3D to 2D:

  • In Figure 8-2, all training instances lie close to a plane: this is a lower-dimensional (2D) subspace of the higher-dimensional (3D) space.

  • If we project every training instance perpendicularly onto this subspace (as represented by the short dashed lines connecting the instances to the plane), we get the new 2D dataset shown in Figure 8-3.

  • Note that the axes correspond to new features z1 and z2: they are the coordinates of the projections on the plane.

[Figure 8-2: the 3D training instances lying close to a 2D plane]

[Figure 8-3: the new 2D dataset after projection]

3.2.1 Principal component analysis (PCA)¶

PCA is by far the most popular dimensionality reduction algorithm. First it identifies the hyperplane that lies closest to the data, and then it projects the data onto it, just like in Figure 8-2.

3.2.1.1 Preserving the Variance

Before we can project the training set onto a lower-dimensional hyperplane, we first need to know how to choose the right hyperplane.

Here is an example with a 2D dataset, shown in Figure 8-7 below:

  • The hyperplane we are looking for in this example is 1D (a line).

  • Here we consider three different lines as candidate hyperplanes (on the right is the result of projecting the dataset onto each of these lines/axes).

  • From this we can see that the solid line preserves the maximum variance, while the other two lines do not preserve the variance as well.

In short, a good hyperplane should preserve the maximum amount of variance after the data is projected onto it, which also means it loses the least relevant information.

[Figure 8-7: the 2D dataset, three candidate axes, and the projection of the dataset onto each of them]

3.2.1.2 Principal Components (PC)

From the above example in Figure 8-7, we know that:

  • PCA identifies the axis that accounts for the largest amount of variance in the training set (the solid line, c1).

  • It also finds a second axis, orthogonal to the first one, that accounts for the largest amount of the remaining variance (the dotted line, c2).

  • If it were a higher-dimensional dataset, PCA would also find a third axis, orthogonal to both previous axes.

To conclude, for an n-dimensional dataset, PCA will find n axes, and these axes are called the principal components (PCs) of the dataset.

How can we find the principal components (PCs) of a training set?

A standard matrix factorization technique called singular value decomposition (SVD) can decompose the training set matrix X into the matrix multiplication of three matrices U, Σ, and V⊺, where V contains the unit vectors that define all the principal components (PCs) we are looking for, as shown in the equation below.

$$\mathbf{X} = \mathbf{U}\,\boldsymbol{\Sigma}\,\mathbf{V}^{\intercal}, \qquad \mathbf{V} = \begin{pmatrix} \mathbf{c}_1 & \mathbf{c}_2 & \cdots & \mathbf{c}_n \end{pmatrix}$$

3.2.1.3 Projecting Down to d Dimensions

Once we have identified all the principal components (PCs), we can reduce the dimensionality of the dataset from n dimensions down to d dimensions by projecting it onto the hyperplane defined by the first d principal components. (Remember we have n PCs for an n-dimensional dataset.)

To project the training set onto the hyperplane and obtain a reduced dataset Xd-proj of dimensionality d, we compute the matrix multiplication of the training set matrix X by the matrix Wd, defined as the matrix containing the first d columns of V (i.e., the first d PCs), using the equation below:

$$\mathbf{X}_{d\text{-proj}} = \mathbf{X}\,\mathbf{W}_d$$
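
As a minimal sketch (not part of the original notebook), the two equations above can be carried out directly with NumPy's np.linalg.svd; the small random matrix here is just a stand-in for a real training set:

In [ ]:
import numpy as np

# Hypothetical stand-in training set (100 instances, 3 features)
rng = np.random.default_rng(42)
X_demo = rng.normal(size=(100, 3))

# SVD-based PCA assumes the data is centered at the origin
X_centered = X_demo - X_demo.mean(axis=0)

U, s, Vt = np.linalg.svd(X_centered)  # rows of Vt are the principal components
W2 = Vt[:2].T                         # Wd: matrix with the first d=2 PCs as columns
X2D_manual = X_centered @ W2          # X_d-proj = X Wd
X2D_manual.shape                      # (100, 2)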

3.2.1.4 Using Scikit-Learn

Scikit-Learn's PCA class uses SVD to implement PCA.

The following code applies PCA to reduce the dimensionality of a dataset down to 2 dimensions.

Note: the input dataset should be centered at the origin; Scikit-Learn's PCA class automatically takes care of centering the data.

In [ ]:
import numpy as np
from scipy.spatial.transform import Rotation

# First generate a 3D dataset X
m = 60
X = np.zeros((m, 3))  # initialize 3D dataset
np.random.seed(42)
angles = (np.random.rand(m) ** 3 + 0.5) * 2 * np.pi  # uneven distribution
X[:, 0], X[:, 1] = np.cos(angles), np.sin(angles) * 0.5  # oval
X += 0.28 * np.random.randn(m, 3)  # add more noise
X = Rotation.from_rotvec([np.pi / 29, -np.pi / 20, np.pi / 4]).apply(X)
X += [0.2, 0, 0.2]  # shift a bit
In [ ]:
X.shape   # .shape can display the dimensions of the dataset X
Out[ ]:
(60, 3)
In [ ]:
from sklearn.decomposition import PCA

# Use the scikit-learn's PCA
pca = PCA(n_components=2)  # "n_components": numbers of reduced dimensions
X2D = pca.fit_transform(X)
X2D.shape
Out[ ]:
(60, 2)

3.2.1.5 Explained Variance Ratio

Another useful piece of information is the explained variance ratio of each principal component. The ratio indicates the proportion of the dataset’s variance that lies along each principal component.

In this example, we get a 2D reduced dataset, so 2 PCs are used. From the code results below, we can see that:

  • The first PC explains about 76% of the variance, while the second explains about 15%, so about 91% in total for this 2D dataset.

  • The higher the total explained variance ratio, the more of the dataset's information is preserved.

In [ ]:
pca.explained_variance_ratio_
Out[ ]:
array([0.7578477 , 0.15186921])

3.2.1.6 Choosing the Right Number of Dimensions

We can choose the number of dimensions by adding up the explained variance ratios until they reach a sufficiently large portion of the variance.

For example, if we want at least 95% Explained Variance Ratio in total:

In [ ]:
from sklearn.datasets import fetch_openml
# Let's use the dataset from the textbook
mnist = fetch_openml('mnist_784', as_frame=False, parser="auto")

# First we split the dataset into training set and test set
X_train, y_train = mnist.data[:60_000], mnist.target[:60_000]
X_test, y_test = mnist.data[60_000:], mnist.target[60_000:]

X_train.shape   # we can see it is a 784-Dimension dataset
Out[ ]:
(60000, 784)
In [ ]:
pca = PCA()   # Define the PCA
pca.fit(X_train)   # Fit the dataset
cumsum = np.cumsum(pca.explained_variance_ratio_)   # "cumsum" computes the cumulative sum of the explained variance ratios
d = np.argmax(cumsum >= 0.95) + 1    # "argmax" returns the index of the first True in (cumsum >= 0.95); +1 turns it into a dimension count
d
Out[ ]:
154
In [ ]:
# Then we can use the "d" we got above, which is 154, to reduce the dimensions
pca = PCA(n_components=d)     # "n_components": number of reduced dimensions
X_train_reduced = pca.fit_transform(X_train)
X_train_reduced.shape   # we can see we successfully got the reduced dataset
Out[ ]:
(60000, 154)
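
Alternatively, Scikit-Learn's PCA also accepts a float between 0.0 and 1.0 for n_components, indicating the ratio of variance to preserve, so the cumulative-sum step can be skipped; a short sketch:

In [ ]:
# Equivalent shortcut: pass the target explained-variance ratio directly
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)

pca.n_components_   # number of components actually kept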

3.3 Manifold learning¶

Projection learning is not always the best approach to dimensionality reduction. In many cases the subspace may twist and turn.

Here we use the famous Swiss roll toy dataset as an example:

  • Figure 8-4 shows what a Swiss roll dataset looks like.

  • From the left side of Figure 8-5, we can see that simply projecting onto a plane would squash different layers of the Swiss roll together.

  • The right side of Figure 8-5 is what we want instead --- unrolling the Swiss roll to obtain the 2D dataset.

[Figure 8-4: the Swiss roll dataset]

[Figure 8-5: projecting the Swiss roll onto a plane (left) versus unrolling it (right)]

Manifold assumptions:

Manifold learning relies on the manifold assumption, also called the manifold hypothesis: most real-world high-dimensional datasets lie close to a much lower-dimensional manifold.

The manifold assumption is often accompanied by another implicit assumption: that tasks (e.g., classification or regression) will be simpler if expressed in the lower-dimensional space of the manifold.

Here we use two more Swiss roll datasets, shown in Figure 8-6, as examples to explain this:

  • For the first Swiss roll example in the top row, the dataset is split into two classes:

    • In the 3D space (on the left) the decision boundary would be fairly complex.
    • In the 2D unrolled manifold space (on the right) the decision boundary is a straight line, which is much simpler than in the 3D space.
  • For the second Swiss roll example in the bottom row, the implicit assumption does not hold --- the decision boundary is located at x1 = 5.

    • This decision boundary looks very simple in the original 3D space (a vertical plane), but it looks more complex in the unrolled manifold (a collection of four independent line segments).

[Figure 8-6: the two Swiss roll classification examples, in the original 3D space (left) and in the unrolled 2D manifold space (right)]

In short, reducing the dimensionality of your training set before training a model will usually speed up training, but it may not always lead to a better or simpler solution; it all depends on the dataset.

3.3.1 Locally linear embedding (LLE)¶

Locally linear embedding (LLE) is a nonlinear dimensionality reduction (NLDR) technique.

What is the workflow of LLE?

It works by first measuring how each training instance linearly relates to its nearest neighbors, and then looking for a low-dimensional representation of the training set where these local relationships are best preserved.

  • Step 1: for each training instance, the algorithm identifies its k nearest neighbors.

  • Step 2: find the linear relationship between each training instance and its k nearest neighbors, yielding an optimized weight matrix that encodes these local linear relationships (see the equations sketched after this list).

  • Step 3: map the training instances into a lower, d-dimensional space while preserving these local relationships as much as possible.

This approach makes it particularly good at unrolling twisted manifolds.
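
To make steps 2 and 3 concrete, the standard LLE formulation (as presented in the textbook [6]) can be written as two optimization problems: first find the weights that best reconstruct each instance from its neighbors, then find the low-dimensional positions that best respect those weights.

$$\hat{\mathbf{W}} = \underset{\mathbf{W}}{\operatorname{argmin}} \sum_{i=1}^{m} \left\| \mathbf{x}^{(i)} - \sum_{j=1}^{m} w_{i,j}\,\mathbf{x}^{(j)} \right\|^2 \quad \text{subject to } w_{i,j} = 0 \text{ if } \mathbf{x}^{(j)} \text{ is not a neighbor of } \mathbf{x}^{(i)}, \; \textstyle\sum_{j=1}^{m} w_{i,j} = 1$$

$$\hat{\mathbf{Z}} = \underset{\mathbf{Z}}{\operatorname{argmin}} \sum_{i=1}^{m} \left\| \mathbf{z}^{(i)} - \sum_{j=1}^{m} \hat{w}_{i,j}\,\mathbf{z}^{(j)} \right\|^2$$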

Here we use a Swiss roll dataset from the textbook as an example:

In [ ]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# First lets make a swiss_roll dataset
X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

# Lets plot the swiss_roll dataset
fig = plt.figure(figsize=(6, 5))
ax = fig.add_subplot(111, projection='3d')
darker_hot = ListedColormap(plt.cm.hot(np.linspace(0, 0.8, 256)))
ax.scatter(X_swiss[:, 0], X_swiss[:, 1], X_swiss[:, 2], c=t, cmap=darker_hot)
ax.view_init(10, -70)
plt.show()
[Output: 3D scatter plot of the Swiss roll dataset]
In [ ]:
# Uses Scikit-Learn’s LocallyLinearEmbedding class to unroll it.
# "n_components": numbers of reduced dimensions
# "n_neighbors": number of neighbors
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)

# Then we can use the defined LLE algorithm to unroll this swiss roll
X_unrolled = lle.fit_transform(X_swiss)
In [ ]:
# Plot the unrolled dataset
plt.scatter(X_unrolled[:, 0], X_unrolled[:, 1], c=t, cmap=darker_hot)
plt.xlabel("$z_1$")
plt.ylabel("$z_2$", rotation=0)
plt.axis([-0.055, 0.060, -0.070, 0.090])
plt.grid(True)

plt.title("Unrolled swiss roll using LLE")
plt.show()
[Output: 2D scatter plot of the unrolled Swiss roll]

As we can see, LLE is quite different from the projection techniques: it can construct much better low-dimensional representations, especially when the data lies on a nonlinear manifold.

3.3.2 t-distributed stochastic neighbor embedding (t-SNE)¶

The PCA and LLE algorithms above mainly focus on keeping similar instances close together after dimensionality reduction. The t-SNE algorithm, in contrast, tries to keep similar instances close while pushing dissimilar instances apart. It is mostly used for visualization, in particular to visualize clusters of instances in high-dimensional space: t-SNE can reduce a high-dimensional dataset to 2D or 3D, which is convenient for plotting.

Here let's go through an exercise from the textbook [6]:

Use t-SNE to reduce the first 5,000 images of the MNIST dataset down to two dimensions and plot the result using Matplotlib

In [ ]:
# Use the MNIST data from the textbook; MNIST is a dataset in which all samples are images
X_train = mnist.data[:60000]
y_train = mnist.target[:60000]

# Take the first 5,000 images of the MNIST dataset
X_sample, y_sample = X_train[:5000], y_train[:5000]

X_sample.shape
Out[ ]:
(5000, 784)
In [ ]:
from sklearn.manifold import TSNE

# The parameter n_components=2 indicates that the data will be reduced to 2 dimensions
# init="random" controls how the initial positions of the data points are chosen before dimensionality reduction
# The learning rate for t-SNE is usually in the range [10, 1000]
# The "auto" option automatically selects an appropriate learning rate based on the number of samples in the dataset
# If the learning rate is too high, each data point may end up roughly equidistant from its nearest neighbours
# If the learning rate is too low, most points may look compressed in a dense cloud
tsne = TSNE(n_components=2, init="random", learning_rate="auto", random_state=42)

# Fit the t-SNE algorithm into the X_sample dataset
X_reduced = tsne.fit_transform(X_sample)

# let's check the dimensions of this dataset after dimensionality reduction
X_reduced.shape
Out[ ]:
(5000, 2)

Now, let's use Matplotlib's scatter() function to plot a scatterplot, using a different color for each digit class:

In [ ]:
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=(12, 8))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_sample.astype(np.int8), cmap="jet", alpha=0.5)
plt.axis('off')
plt.colorbar()
plt.show()
[Output: 2D scatter plot of the t-SNE-reduced MNIST sample, colored by digit]

From the picture above, we can see that most clusters are nicely separated from the others: t-SNE identified clusters of similar images.

Then, let's use the PCA method to reduce the dimensionality and compare it with t-SNE.

In [ ]:
from sklearn.decomposition import PCA

# define a PCA algorithm
pca = PCA(n_components=2, random_state=42)
X_pca_reduced = pca.fit_transform(X_sample)

# plot the scatter graph
plt.figure(figsize=(12, 8))
plt.scatter(X_pca_reduced[:, 0], X_pca_reduced[:, 1], c=y_sample.astype(np.int8), cmap="jet", alpha=0.5)
plt.axis('off')
plt.colorbar()
plt.show()
[Output: 2D scatter plot of the PCA-reduced MNIST sample, colored by digit]

From the picture above, we can see that PCA only brings similar training instances closer together but does not push dissimilar instances apart, so most clusters overlap.

4 References¶

[5] https://github.com/chembl/chembl_webresource_client

[6] Géron A. Hands-on Machine Learning with Scikit-Learn, Keras and TensorFlow : Concepts, Tools, and Techniques to Build Intelligent Systems. Third edition. O’Reilly Media, Inc.; 2023.